#########
## Nature Human Behaviour 
## Leading Countries in Global Science Increasingly Receive More Citations than Other Countries Despite Doing Similar Research.
## https://doi.org/10.1038/s41562-022-01351-5
## Harvard Dataverse (Code and Metadata): https://doi.org/10.7910/DVN/WCOINR 
## Step 0E
## Data: Data_20210905
#########

# At a terminal, run: 
# ml python/3.6.1
# ml py-ipython/6.1.0_py36 python/3.6.1 py-scipy/1.1.0_py36 py-scikit-learn/0.19.1_py36 py-pandas/0.23.0_py36 gcc/10.1.0 py-pytorch/1.4.0_py36
# ml py-numpy/1.17.2_py36
# srun python3 -u Step_X0E_Python3_RR_MAG_Journal_Coverage.py 

#############################
### Time Start
#############################
import time 
start_time = time.time()

#############################
### Modules
#############################
from pyathena import connect
import pandas as pd
import json
import numpy as np
import nltk
import string
from nltk import word_tokenize
from nltk.corpus import stopwords
from nltk import everygrams
import gc 
from nltk.stem import *
from nltk.tokenize import RegexpTokenizer
from sklearn.feature_extraction.text import CountVectorizer 
from nltk.stem.porter import *
from nltk.stem.snowball import SnowballStemmer
import sys
from collections import Counter
import random
import bz2 
import pickle
#import cPickle
import itertools
import multiprocessing as mp
from os import path
#from importlib import reload
import os 
import random
import time 
import re

import itertools
import psutil

from googletrans import Translator

from rake_nltk import Metric, Rake

import boto3
import smart_open

import goslate
import math

def fastAthenaQuery(input_sql, input_chunksize = None):

	#This function will return a stream of the s3 file.
	# MUCH faster than PyAthena reading a few rows at a time via the API
	""":Return: a Pandas DataFrame of results from a `sql` query executed against AWS Athena."""
	cursor.execute(input_sql)
	#The s3_path should be of the format: '<bucket_name>/<file_path_inside_the_bucket>'
	#This is the full path with credentials:
	complete_s3_path = 'REDACTED' + cursor.output_location.split("REDACTED")[1]

	if input_chunksize == None:
		outputfile = pd.read_csv(smart_open.smart_open(complete_s3_path))
	else:
		outputfile = pd.read_csv(smart_open.smart_open(complete_s3_path),chunksize=input_chunksize)

	# Delete the file from the s3 bucket
	s3 = boto3.resource("REDACTED",
		aws_access_key_id='REDACTED',
		aws_secret_access_key= 'REDACTED')
	obj = s3.Object("REDACTED",
		complete_s3_path)
	obj.delete()

	return outputfile

###############################
### Connections
###############################

cursor = connect(aws_access_key_id='REDACTED',
                 aws_secret_access_key='REDACTED',
                 s3_staging_dir='REDACTED',
                 region_name='REDACTED').cursor()

conn = connect(aws_access_key_id='REDACTED',
                 aws_secret_access_key='REDACTED',
                 s3_staging_dir='REDACTED',
                 region_name='REDACTED')


#############################
### Query
#############################

for year_ in [1980,1990,2000,2010]:

	check_journal_coverage = '''
	select true as returnTrue where EXISTS (SELECT * FROM INFORMATION_SCHEMA.TABLES WHERE TABLE_NAME = 'temp_journal_coverage_YEARHERE');
	'''

	drop_journal_coverage = '''
	drop table if exists mag_staging.temp_journal_coverage_YEARHERE;
	'''

	create_table_journal_coverage = '''
	create table mag_staging.temp_journal_coverage_YEARHERE 
	WITH (
	    format = 'TEXTFILE',
	    field_delimiter = '\t'
	  )
	as
	SELECT abc.fieldofstudyid,
	         abc.journalid,
	         abc.displayname,
	         count(*) AS coverage_over_time
	FROM 
	    (SELECT ab.fieldofstudyid,
	         c.journalid,
	         c.displayname,
	         ab.full_year,
	         count(*) AS number_of_papers
	    FROM "mag_data"."journals" c
	    JOIN 
	        (SELECT *
	        FROM "mag_data"."papers" a
	        JOIN 
	            (SELECT *
	            FROM "mag_data"."paperfieldsofstudy") b
	                ON a.paperid = b.paperid) ab
	                ON c.journalid = ab.journalid
	            WHERE ab.full_year>=YEARHERE and ab.full_year<(YEARHERE+10) and ab.full_year<=2017
	            GROUP BY  ab.fieldofstudyid, ab.full_year,c.journalid, c.displayname) abc
	        GROUP BY  abc.fieldofstudyid, abc.journalid, abc.displayname
	'''

	if pd.read_sql(check_journal_coverage.replace("YEARHERE",str(year_)), conn)["returnTrue"].empty == True:
		cursor.execute(create_table_journal_coverage.replace("YEARHERE",str(year_)))

	query_journal_coverage = '''
	SELECT *
	FROM "mag_staging"."temp_journal_coverage_YEARHERE"
	'''


	df_journal_coverage = fastAthenaQuery(query_journal_coverage.replace("YEARHERE",str(year_)),None)
	if year_!=2010:
		df_journal_coverage["Coverage"] = "["+str(year_)+","+str(year_+10)+")"
		Journal_Year_Coverage = "OUTPUT_Python_MAG_Journal_Coverage_From_"+str(year_)+"_to_"+str(year_+10)+".csv.gz"
	else:
		df_journal_coverage["Coverage"] = "["+str(year_)+",2017]"
		Journal_Year_Coverage = "OUTPUT_Python_MAG_Journal_Coverage_From_"+str(year_)+"_to_2017.csv.gz"

	df_journal_coverage.to_csv(Journal_Year_Coverage,index=False,compression='gzip')

	cursor.execute(drop_journal_coverage.replace("YEARHERE",str(year_)))
